#Infer ASVs
##Infer with DADA2 in QIIME2
# Upstream steps: build the sample manifest and import raw reads into QIIME2
src/03.create_manifest.sh
src/04.import_data.sh
# Remove the 341F/805R (V3-V4) primers from both reads
qiime cutadapt trim-paired --i-demultiplexed-sequences results/01.demux_import_data.qza --p-front-f CCTACGGGNGGCWGCAG --p-front-r GACTACHVGGGTATCTAATCC --o-trimmed-sequences results/02.demux_remove_adapter.qza
qiime demux summarize --i-data results/02.demux_remove_adapter.qza --o-visualization results/02.demux_remove_adapter.qzv
# Version 1: truncate F at 260 / R at 280
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter.qza --p-trunc-len-f 260 --p-trunc-len-r 280 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v1_260-280/rep-seqs-v1-260-280.qza --o-table results/v1_260-280/feature-table_v1-260-280.qza --o-denoising-stats results/v1_260-280/denoising-stats_v1-260-280.qza --p-n-threads 60
qiime tools export --input-path results/v1_260-280/denoising-stats_v1-260-280.qza --output-path results/v1_260-280/denoising-stats_v1-260-280
# Version 2: truncate F at 250 / R at 200
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter.qza --p-trunc-len-f 250 --p-trunc-len-r 200 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v2_250-200/rep-seqs-v2-250-200.qza --o-table results/v2_250-200/feature-table_v2-250-200.qza --o-denoising-stats results/v2_250-200/denoising-stats_v2-250-200.qza --p-n-threads 80
qiime tools export --input-path results/v2_250-200/denoising-stats_v2-250-200.qza --output-path results/v2_250-200/denoising-stats_v2-250-200
# Test with reads cleaned with Trim Galore; the import and adapter removal
# steps were repeated for this input.
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter_trim.qza --p-trunc-len-f 250 --p-trunc-len-r 200 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v2_250-200/rep-seqs-v2-250-200-trim.qza --o-table results/v2_250-200/feature-table_v2-250-200-trim.qza --o-denoising-stats results/v2_250-200/denoising-stats_v2-250-200-trim.qza --p-n-threads 80
qiime tools export --input-path results/v2_250-200/denoising-stats_v2-250-200-trim.qza --output-path results/v2_250-200/denoising-stats_v2-250-200-trim
##Infer ASVs with DADA2 in R
library(dada2)
# Load trimmed fastq files (cutadapt output) and list fastq_path content
fastq_path <- "/axolote/diana/diana/macrogen_all/pisolithus/results/03.cutadapt/"
# Sort file names so the forward and reverse lists stay paired by sample
Fs <- sort(list.files(fastq_path, pattern="_1.fastq"))
Rs <- sort(list.files(fastq_path, pattern="_2.fastq"))
# Extract sample names: everything before the "_1" read suffix
sampleNames <- sapply(strsplit(Fs, "_1"), `[`, 1)
sampleNames
# Add complete path to remove ambiguities errors
Fs <- file.path(fastq_path, Fs)
Rs <- file.path(fastq_path, Rs)
# Quality check plot of a single sample pair (index 40 is hard-coded)
QCplot_1 <- plotQualityProfile(c(rbind(Fs[40],Rs[40])))
#pdf("results/plots/02.dada/01.QCplot_1.pdf")
QCplot_1
#dev.off()
# NOTE(review): this call is identical to QCplot_1 (same Fs[40]/Rs[40]);
# a different sample index was probably intended — confirm.
QCplot_2 <- plotQualityProfile(c(rbind(Fs[40],Rs[40])))
## Filter
### explore trunclen versions
# Create directory for clean reads
filt_path <- file.path("results/04.Dada2" , "filter_reads")
if(!file_test("-d", filt_path))
dir.create(filt_path)
filtFs <- file.path(filt_path, paste0(sampleNames, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sampleNames, "_R_filt.fastq.gz"))
# Compare truncation-length versions.
# NOTE(review): all three runs below write to the SAME filtFs/filtRs paths,
# so each run overwrites the previous one's filtered fastq files; the files
# left on disk after this section are those of the LAST run (V3), even if
# another version is chosen as final downstream — confirm this is intended.
# Quality control
# V1
out1 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
truncLen=c(280,200),
maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE)
#V2 extra permissive
out2 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
truncLen=c(250,200),
maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE)
#V3: truncLen 0 keeps the full forward read length
out3 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
truncLen=c(0,200),
maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE)
# Convert the filterAndTrim matrices (reads.in / reads.out) to data frames
v1 <- as.data.frame(out1)
v2 <- as.data.frame(out2)
v3 <- as.data.frame(out3)
# Compute the percentage of reads kept after filtering and tag every row
# with a version label so several runs can be stacked (rbind) and compared.
calculate_percentage <- function(df, group_name) {
  df$percentage <- 100 * df$reads.out / df$reads.in
  df$version <- group_name
  df
}
# Get percentage for each filter version
out1_with_percentage <- calculate_percentage(v1, 'out1')
out2_with_percentage <- calculate_percentage(v2, 'out2')
out3_with_percentage <- calculate_percentage(v3, 'out3')
# Combine into one long table for plotting
combined_data <- rbind(out1_with_percentage, out2_with_percentage, out3_with_percentage)
# Compare the three filter versions in a boxplot
library(ggplot2)
filter_versions <- ggplot(combined_data, aes(x = version, y = percentage, fill = version)) +
  geom_boxplot() +
  labs(x = "Filter version", y = "Percentage of reads after filter") +
  theme_bw() +
  scale_fill_brewer(palette = "Set2")
pdf("results/plots/02.dada/02.Filter_versions.pdf")
# BUG FIX: a ggplot object must be print()ed inside pdf(); a bare object is
# not auto-printed in a source()d script, which left the PDF empty.
print(filter_versions)
dev.off()
## png
## 2
filter_versions
# Save info of the final version (V2): totals before and after cleaning
write.table(out2, file="results/04.Dada2/Dada_clean.tsv", quote=F, sep="\t",col.names=NA)
# De-replicate to reduce redundancy (collapse identical reads)
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
# Add sample names to the de-rep objects
names(derepFs) <- sampleNames
names(derepRs) <- sampleNames
# Generate the error model for each read direction
errF <- learnErrors(derepFs, multithread=TRUE, verbose=TRUE)
errR <- learnErrors(derepRs, multithread=TRUE, verbose=TRUE)
# pool = "pseudo": samples are denoised independently but share detected
# sequences in a second pass, improving sensitivity to rare variants that
# occur in several samples, at lower cost than full pooling.
dadaFs <- dada(derepFs, err=errF, multithread=TRUE, pool = "pseudo", verbose=TRUE)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE, pool = "pseudo", verbose = TRUE)
# Merge pairs
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, minOverlap = 8, verbose=TRUE)
# Create ASVs table
seqtabAll <- makeSequenceTable(mergers)
table(nchar(getSequences(seqtabAll)))
# Remove chimeras
seqtab_nochim <- removeBimeraDenovo(seqtabAll, method="consensus", multithread=TRUE, verbose=TRUE)
# create a new table with each ASV number and its representative sequence
PE.table_tsv_output <- seqtab_nochim
PE.table_tsv_output[PE.table_tsv_output==1]=0 # Don't consider those values that have a single observation per sample, make them 0 (sample singletons)
PE.table_tsv_output <- PE.table_tsv_output[,colSums(PE.table_tsv_output)>1] # filter singleton ASVs across the table
# Export sequences as in fasta format
uniquesToFasta(PE.table_tsv_output, fout="results/04.Dada2/ASVs.fasta", ids=paste("ASV_",1:ncol(PE.table_tsv_output), sep=""))
nochim=PE.table_tsv_output
# BUG FIX: this ASV-number -> sequence map was written to the same file name
# as the count table below ("ASV_to_seqs-nochim.tsv") and was immediately
# clobbered; write it to its own file so the mapping is preserved.
write.table(cbind("ASVs"=1:nrow(t(PE.table_tsv_output)),"rep_seq"=rownames(t(PE.table_tsv_output))), file="results/04.Dada2/ASV_to_seqs_map.tsv", quote=F, sep="\t",row.names=FALSE)
# replace the rep_seq with an incremental ASV number
PE.table_tsv_output <- t(PE.table_tsv_output)
rownames(PE.table_tsv_output) <- paste0("ASV_",1:nrow(PE.table_tsv_output))
# and save ASV table (ASVs x samples; consumed later by the biom import)
write.table(PE.table_tsv_output, file="results/04.Dada2/ASV_to_seqs-nochim.tsv", quote=F, sep="\t",col.names=NA)
# By using this, we can create a function to automate this for all samples in a set:
getN <- function(x) sum(getUniques(x)) # Where getUniques gets non-repeated sequences from a dada2 object or merger object (joined reads)
# Per-sample read counts at each stage: out1 contributes reads.in/reads.out,
# then dereplicated, denoised (R1/R2), merged, and non-chimeric counts.
# NOTE(review): out1 (V1 filter) is used here although out2 was saved as the
# "final version" above — confirm which run's counts are intended.
track <- cbind(out1, sapply(derepFs, getN), sapply(dadaFs, getN), sapply(dadaRs, getN), rowSums(seqtabAll), rowSums(nochim))
colnames(track) <- c("Raw", "Qual_filter", "Derep", "ASVs R1", "ASVs R2", "Merged", "nonchim")
rownames(track) <- sampleNames
write.table(track, "results/04.Dada2/Seqs_lost_in_ASVs_processing.tsv", col.names=NA, sep="\t")
# Create a quick assessment of sequences lost throughout the process
pdf("results/plots/02.dada/03.sequences_throughout_ASV_process.pdf")
# Percentage remaining per step, dropping column 5 ("ASVs R2") for the R1 view
matplot(t(track[,-5]/track[,1]*100),type='l',xaxt='n', main="Sequences remaining after each step - R1 (%)", xlab="Step", ylab=" Percentage of Sequences remaining")
axis(1,at=1:ncol(track[,-5]),labels=colnames(track[,-5]))
# R2 view: drop column 4 ("ASVs R1")
matplot(t(track[,-4]/track[,1]*100),type='l',xaxt='n', main="Sequences remaining after each step - R2 (%)", xlab="Step", ylab=" Percentage of Sequences remaining")
axis(1,at=1:ncol(track[,-4]),labels=colnames(track[,-4]))
dev.off()
## Add final table with the overall percentage of reads surviving the pipeline
track2 <- data.frame(track)
track2$percentage_used <-(track2$nonchim / track2$Raw) * 100
track2
write.table(track2, "results/04.Dada2/Seqs_lost_in_ASVs_processing_percentage.tsv", col.names=NA, sep="\t")
# Save work so far
save.image(file = "Dada2.RData")
# --- Compare denoising versions across runs (Python / pandas) ---
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt ## /axolote/diana/.local/lib/python3.10/site-packages/matplotlib/projections/__init__.py:63: UserWarning: Unable to import Axes3D. This may be due to multiple versions of Matplotlib being installed (e.g. as a system package and as a pip package). As a result, the 3D projection is not available.
## warnings.warn("Unable to import Axes3D. This may be due to multiple versions of "
import seaborn as sns
# Denoising stats from the three QIIME2 runs (exported stats.tsv files)
v1 = pd.read_csv('results/01.qiime/v1_260-280/denoising-stats_v1-260-280.qzv/stats.tsv',sep = '\t')
v2 = pd.read_csv('results/01.qiime/v2_250-200/denoising-stats_v2-250-200/stats.tsv',sep = '\t')
v3 = pd.read_csv('results/01.qiime/v2_250-200/denoising-stats_v2-250-200-trim/stats.tsv',sep = '\t')
# Tracking table from the R/dada2 run (fourth "version" to compare).
# BUG FIX: this assignment and the def below were fused onto one line.
v4 = pd.read_csv('results/04.Dada2/Seqs_lost_in_ASVs_processing_percentage.tsv', sep='\t')

def compare_table(sample_id, v1, v2, v3, column_names):
    """Build a DataFrame with a 'sample-id' column plus one column per
    version (named after column_names[i]) holding that version's values."""
    table = pd.DataFrame()
    table['sample-id'] = sample_id
    for i, versions in enumerate([v1, v2, v3]):
        column_name = column_names[i]
        table[column_name] = versions
    return table
column_names = ["v1","v2","v3"]
# Percentage of non-chimeric reads per sample for the three QIIME2 runs
denoising_compare_versions = compare_table(v1['sample-id'],v1['percentage of input non-chimeric'], v2['percentage of input non-chimeric'], v3['percentage of input non-chimeric'], column_names)
# Add the R/dada2 run as a fourth column
denoising_compare_versions['dadaR']=v4['percentage_used']
denoising_compare_versions
## sample-id v1 v2 v3 dadaR
## 0 #q2:types numeric numeric numeric NaN
## 1 DR1014-E 49.89 87.25 89.7 93.849904
## 2 DR1014-SAH 25.01 46.47 48.42 49.732411
## 3 DR1015-SAH 23.81 45.81 47.51 48.452470
## 4 DR1018-E 41.54 74.95 79.7 83.522255
## .. ... ... ... ... ...
## 79 SAH-5 8.12 51.63 54.49 55.626792
## 80 SAH-6 7.32 54.69 58.04 59.153486
## 81 SAH-7 5.27 51.98 55.82 57.347706
## 82 SAH-8 5.29 47.8 50.69 52.059677
## 83 SAH-9 7.53 48.97 51.53 52.320221
##
## [84 rows x 5 columns]
# Row 0 is the QIIME2 '#q2:types' type-annotation row, not a sample: drop it.
# BUG FIX: the drop and the display expression were fused onto one line.
denoising_compare_versions.drop(0, inplace=True)
denoising_compare_versions
## sample-id v1 v2 v3 dadaR
## 1 DR1014-E 49.89 87.25 89.7 93.849904
## 2 DR1014-SAH 25.01 46.47 48.42 49.732411
## 3 DR1015-SAH 23.81 45.81 47.51 48.452470
## 4 DR1018-E 41.54 74.95 79.7 83.522255
## 5 DR1018-SAH 23.54 44.9 46.54 47.467611
## .. ... ... ... ... ...
## 79 SAH-5 8.12 51.63 54.49 55.626792
## 80 SAH-6 7.32 54.69 58.04 59.153486
## 81 SAH-7 5.27 51.98 55.82 57.347706
## 82 SAH-8 5.29 47.8 50.69 52.059677
## 83 SAH-9 7.53 48.97 51.53 52.320221
##
## [83 rows x 5 columns]
def get_boxplot(tabla, title="Percentage of Effective Reads", xlabel="Denoising Versions", ylabel="Percentage of Effective Reads" + "\n" + "after clean, merge, and non-chimeric"):
    """Draw a seaborn boxplot comparing denoising versions, save it as PNG,
    and show it. Skips plotting (with a message) when the table is empty or
    has fewer than two columns."""
    if tabla.empty or len(tabla.columns) < 2:
        print("the table does not have sufficient data to get boxplot.")
        return
    fig, ax = plt.subplots()
    sns.boxplot(data=tabla, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.savefig('results/plots/denoising_versions_boxplot.png')
    plt.show()

# BUG FIX: the original called get_boxplot(versions), but `versions` is never
# defined at module level; the comparison table built above is the intended
# argument.
get_boxplot(denoising_compare_versions)
# Next: train the Silva database on the V3-V4 amplicon to increase the
# taxonomic assignment resolution.
mkdir -p data/dbs
#get the database and check its integrity
# BUG FIX: download into data/dbs/ with -P; a plain wget writes to the
# current directory, so the md5sum checks on data/dbs/... would fail.
# The raw md5 output lines were also left uncommented (they would execute
# as commands); they are kept below as expected-value comments.
wget -P data/dbs/ https://data.qiime2.org/2023.5/common/silva-138-99-seqs.qza
md5sum data/dbs/silva-138-99-seqs.qza
# expected: de8886bb2c059b1e8752255d271f3010  data/dbs/silva-138-99-seqs.qza
wget -P data/dbs/ https://data.qiime2.org/2023.5/common/silva-138-99-tax.qza
md5sum data/dbs/silva-138-99-tax.qza
# expected: f12d5b78bf4b1519721fe52803581c3d  data/dbs/silva-138-99-tax.qza
conda activate qiime2-2023.5
#extract the fragments matching the V3-V4 primer pair
qiime feature-classifier extract-reads \
--i-sequences data/dbs/silva-138-99-seqs.qza \
--p-f-primer CCTACGGGNGGCWGCAG --p-r-primer GACTACHVGGGTATCTAATCC \
--p-min-length 250 --p-max-length 450 \
--o-reads data/dbs/silva-138-99-seqs-extracted.qza --p-n-jobs 40
#train the naive Bayes classifier on the extracted region
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads data/dbs/silva-138-99-seqs-extracted.qza \
--i-reference-taxonomy data/dbs/silva-138-99-tax.qza \
--o-classifier data/dbs/classifier_silva_138_trained.qza
#Taxonomy in QIIME2
mkdir -p results/05.taxonomy_qiime/
# Import the representative ASV sequences produced by the R/dada2 run
qiime tools import --input-path results/04.Dada2/ASVs.fasta --type 'FeatureData[Sequence]' --output-path results/05.taxonomy_qiime/ASV_rep_seq.qza
# append missing header to the table for import
cat <(echo -n "#OTU Table") results/04.Dada2/ASV_to_seqs-nochim.tsv > results/05.taxonomy_qiime/temp.txt
# convert to biom
biom convert -i results/05.taxonomy_qiime/temp.txt -o results/05.taxonomy_qiime/temp.biom --table-type="OTU table" --to-hdf5
#create feature table
qiime tools import --input-path results/05.taxonomy_qiime/temp.biom --type 'FeatureTable[Frequency]' --input-format BIOMV210Format --output-path results/05.taxonomy_qiime/ASV_table.qza
#taxonomy assignment with the trained Silva classifier
qiime feature-classifier classify-sklearn --i-classifier data/dbs/classifier_silva_138_trained.qza --i-reads results/05.taxonomy_qiime/ASV_rep_seq.qza --o-classification results/05.taxonomy_qiime/taxonomy.qza --p-n-jobs 80
#get visualization
qiime metadata tabulate \
--m-input-file results/05.taxonomy_qiime/taxonomy.qza \
--o-visualization results/05.taxonomy_qiime/taxonomy.qzv
#get visual fasta to fast compare the taxonomic assignments with the top BLASTn hits for certain ASVs
qiime feature-table tabulate-seqs \
--i-data results/05.taxonomy_qiime/ASV_rep_seq.qza \
--o-visualization results/05.taxonomy_qiime/ASV_rep_seq.qzv
#Summary of the qza table imported from R
qiime feature-table summarize \
--i-table results/05.taxonomy_qiime/ASV_table.qza \
--o-visualization results/05.taxonomy_qiime/ASV_table_summary.qzv
# Remove non-bacterial and organellar lineages; keep only features assigned
# at least to phylum level (p__).
# BUG FIX: the --p-exclude list contained spaces, which the shell splits into
# separate arguments; it must be a single comma-separated token.
qiime taxa filter-table --i-table results/05.taxonomy_qiime/ASV_table.qza --i-taxonomy results/05.taxonomy_qiime/taxonomy.qza --p-exclude Archaea,Eukarya,Eukaryota,mitochondria,chloroplast --p-include p__ --o-filtered-table results/05.taxonomy_qiime/ASV_table_filter_aemc.qza
qiime feature-table summarize --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc.qza --o-visualization results/05.taxonomy_qiime/ASV_table_summaryfilter_aemc.qzv
# Remove ASVs with frequency below 0.1% of the mean sample depth; this
# cut-off excludes ASVs likely due to MiSeq bleed-through between runs
# (reported by Illumina to be 0.1% of reads). Mean sample depth * 0.001,
# rounded to the nearest integer, gives the min-frequency of 62 used below.
qiime feature-table filter-features --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc.qza --p-min-samples 1 --p-min-frequency 62 --o-filtered-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza
qiime feature-table summarize --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --o-visualization results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qzv
# Keep only the representative sequences present in the filtered table
qiime feature-table filter-seqs --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --i-data results/05.taxonomy_qiime/ASV_rep_seq.qza --o-filtered-data results/05.taxonomy_qiime/ASV_rep_seq_filters.qza
#Exports to text format
#taxonomy
qiime tools export --input-path results/05.taxonomy_qiime/taxonomy.qza --output-path results/05.taxonomy_qiime/exports/taxonomy
#feature table
qiime tools export --input-path results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --output-path results/05.taxonomy_qiime/exports/ASV_table
# Reformat the taxonomy tsv header so biom add-metadata accepts it
sed -i -e '1 s/Feature/#Feature/' -e '1 s/Taxon/taxonomy/' results/05.taxonomy_qiime/exports/taxonomy/taxonomy.tsv
#Add taxonomy to feature table
biom add-metadata -i results/05.taxonomy_qiime/exports/ASV_table/feature-table.biom -o results/05.taxonomy_qiime/exports/feature-table_tax.biom --observation-metadata-fp results/05.taxonomy_qiime/exports/taxonomy/taxonomy.tsv --sc-separated results/05.taxonomy_qiime/exports/taxonomy/taxonomy.tsv
#Convert to tsv from biom format
biom convert -i results/05.taxonomy_qiime/exports/feature-table_tax.biom -o results/05.taxonomy_qiime/exports/feature-table_tax.tsv --to-tsv --header-key taxonomy
# Build the phylogenetic tree (MAFFT alignment + IQ-TREE), rooted tree is
# needed later for UniFrac distances
qiime phylogeny align-to-tree-mafft-iqtree \
--p-n-threads auto --i-sequences results/05.taxonomy_qiime/ASV_rep_seq_filters.qza \
--o-alignment results/05.taxonomy_qiime/align.qza \
--o-masked-alignment results/05.taxonomy_qiime/masked-align-iqtree.qza \
--o-tree results/05.taxonomy_qiime/unrooted-tree-iqtree.qza \
--o-rooted-tree results/05.taxonomy_qiime/rooted-tree-iqtree.qza --verbose
# Import data to R from QIIME2 and check prevalence to filter data
# NOTE(review): qza_to_phyloseq() is provided by the qiime2R package;
# confirm library(qiime2R) is loaded before this point.
# 01. Load data
physeq_qiime2 <- qza_to_phyloseq(
features = "results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza",
tree = "results/05.taxonomy_qiime/rooted-tree-iqtree.qza",
taxonomy = "results/05.taxonomy_qiime/taxonomy.qza",
metadata = "data/metadata.tsv")
physeq_qiime2
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 9231 taxa and 83 samples ]
## sample_data() Sample Data: [ 83 samples by 8 sample variables ]
## tax_table() Taxonomy Table: [ 9231 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 9231 tips and 9230 internal nodes ]
# 02. Explore prevalence
## 02.1 Get prevalence: number of samples in which each ASV occurs
prevdf = apply(X = otu_table(physeq_qiime2),
MARGIN = ifelse(taxa_are_rows(physeq_qiime2), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
## 02.2 Add total abundance and taxonomy
prevdf = data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(physeq_qiime2),
tax_table(physeq_qiime2))
## 02.3 Check prevalence at Phylum level (mean and total prevalence per phylum)
dfprev <- plyr::ddply(prevdf, "Phylum", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
## 02.4 Get genus prevalence plot
prevalence_genus = subset(prevdf, Genus %in% get_taxa_unique(physeq_qiime2, "Genus"))
prev_genus <- ggplot(prevalence_genus, aes(TotalAbundance,
Prevalence /nsamples(physeq_qiime2),color=Genus)) +
geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence") +
facet_wrap(~Phylum) + theme(legend.position="none")
#save prevalence plot
pdf("results/plots/03.diversity/01.Prevalence_Phylum_and_genus.pdf")
# BUG FIX: a ggplot object must be print()ed inside pdf(); a bare object is
# not auto-printed in a source()d script, which left the PDF empty.
print(prev_genus)
dev.off()
prev_genus
# Phyla flagged for removal.
# BUG FIX: the display expression above and this assignment were fused onto
# one line. NOTE(review): "Lastescibacterota" looks like a misspelling of the
# Silva phylum "Latescibacterota"; if so, this filter silently matches
# nothing — confirm against the taxonomy table.
filterPhyla = c("GAL15", "Lastescibacterota")
physeq_filter_phyla = subset_taxa(physeq_qiime2, !Phylum %in% filterPhyla)
filterPhyla2 = c("Elusimicrobiota", "Cyanobacteria", "Dependentiae")
# Remove the filterPhyla2 names at every rank, in case they also appear
# below the phylum level
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Phylum %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Class %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Order %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Family %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Genus %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Species %in% filterPhyla2)
physeq_filter_phyla
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 9215 taxa and 83 samples ]
## sample_data() Sample Data: [ 83 samples by 8 sample variables ]
## tax_table() Taxonomy Table: [ 9215 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 9215 tips and 9214 internal nodes ]
## Get filter genus prevalence plot
prevalence_genus_filter = subset(prevdf, Genus %in% get_taxa_unique(physeq_filter_phyla, "Genus"))
prev_genus_filter <- ggplot(prevalence_genus_filter, aes(TotalAbundance,
Prevalence /nsamples(physeq_filter_phyla),color=Genus)) +
geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence") +
facet_wrap(~Phylum) + theme(legend.position="none")
#save filter prevalence plot
pdf("results/plots/03.diversity/02.Prevalence_Phylum_and_genus_filter.pdf")
# BUG FIX: print() required for ggplot objects inside pdf()
print(prev_genus_filter)
dev.off()
prev_genus_filter
library(ggplot2)
# Sample counts per Location/Source.
# NOTE(review): plot_frequencies() is not from ggplot2 (it matches the
# microbiome package API) — confirm that package is loaded.
freq_samples <- plot_frequencies(sample_data(physeq_filter_phyla), "Location", "Source") + theme_bw()
# Rebuild as an explicit dodged bar chart of sample counts
freq_samples <- ggplot(freq_samples$data, aes(x = Groups, y = n, fill = Factor)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of samples of Source and Location") +
theme_bw() + scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
pdf("results/plots/03.diversity/Number_of_Samples.pdf")
# BUG FIX: the plot was built between pdf() and dev.off() but never printed,
# producing an empty PDF.
print(freq_samples)
dev.off()
## png
## 2
freq_samples
## 3.1 FASTER rarefaction curves with Vegan
mat <- as(t(otu_table(physeq_filter_phyla)), "matrix")
raremax <- min(rowSums(mat))
pdf("results/plots/03.diversity/03.Rarefaction_curves_vegan.pdf")
# rarecurve() is base graphics and draws directly into the open device;
# system.time() just records how long it took.
vegan_rarefaction_curves <- system.time(rarecurve(mat, step = 100, sample = raremax,
col = "blue", label = FALSE))
dev.off()
#Acumulation curves
library(ranacapa)
## 03.2 Get accumulation curves
acumulation_curves <- ggrare(physeq_filter_phyla, step = 100, color = "Source", label = "sample")
acumulation_curves
#Save at this point
save.image(file = "Dada2.RData")
load("Dada2.RData")
#custom plot: accumulation curves faceted by Source
acumulation_curves_plot <- acumulation_curves + facet_wrap(~Source) +
labs(title="Accumulative curves") + theme_bw() +
scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
acumulation_curves_plot
#save plot
pdf("results/plots/03.diversity/04.Rarefaction_curves_ranacapa_facewrapSource.pdf")
# BUG FIX: print() required for ggplot objects inside pdf()
print(acumulation_curves_plot)
dev.off()
#custom plot: accumulation curves faceted by Location
acumulation_curves_plot_loc <- acumulation_curves + facet_wrap(~Location) +
labs(title="Accumulative curves") + theme_bw() +
scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
# BUG FIX: the display expression and the pdf() call were fused into one
# token ("..._locpdf("), which called an undefined function.
acumulation_curves_plot_loc
pdf("results/plots/03.diversity/04.Rarefaction_curves_ranacapa_facewrapLoc.pdf")
print(acumulation_curves_plot_loc)
dev.off()
#Get diversity index
# microbiome::alpha() computes all supported alpha-diversity indices at once
diversity_all_index <- microbiome::alpha(physeq_filter_phyla, index = "all")
#save table
write.table(diversity_all_index, "results/06.postprocess/all_alpha_index.tsv", col.names=NA, sep="\t")
# Grey palette for the Location levels
location_colors= c("grey24","grey64","grey80")
# Alpha diversity (Observed richness and Shannon) by Location, filled by Source
alpha_source_and_location <-plot_richness(physeq_filter_phyla, color = "Location", x = "Location", measures = c("Observed", "Shannon")) +
geom_boxplot(aes(fill = Source), alpha=.7) +
scale_color_manual(values = location_colors) + scale_fill_brewer(palette = "Set2") +
theme_bw() +
theme(axis.text.x = element_text(size = 5, angle = 10))
# NOTE(review): ggsave() with no plot argument saves ggplot2's "last plot";
# passing plot = alpha_source_and_location explicitly would be safer — confirm.
ggsave("results/plots/03.diversity/05.Alpha_diversity_source-location.pdf", device = "pdf", dpi = 300, units = "px", width=3650, height=2035, bg = "white")
alpha_source_and_location
# Bray NMDS ordination
nmds_bray <- ordinate(physeq_filter_phyla, method = "NMDS", distance = "bray")
# Get stress value
var_stress_nmds_bray <- round(nmds_bray$stress, 5)
var_stress_nmds_bray
## [1] 0.09511
# Check that the fit is good with a Shepard plot
stressplot(nmds_bray)
#### ANOSIM
#extract metadata
metadata <- data.frame(phyloseq::sample_data(physeq_filter_phyla),
check.names = FALSE)
# Test for significant community differences (Bray-Curtis, 9999
# permutations) between Source groups and between Location groups
anosim_source <- anosim(x= as.data.frame(t(otu_table(physeq_filter_phyla))),
grouping = metadata$Source,
permutations = 9999, distance = "bray")
anosim_location <- anosim(x= as.data.frame(t(otu_table(physeq_filter_phyla))),
grouping = metadata$Location,
permutations = 9999, distance = "bray")
anosim_source
##
## Call:
## anosim(x = as.data.frame(t(otu_table(physeq_filter_phyla))), grouping = metadata$Source, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.1951
## Significance: 0.0001
##
## Permutation: free
## Number of permutations: 9999
anosim_location
##
## Call:
## anosim(x = as.data.frame(t(otu_table(physeq_filter_phyla))), grouping = metadata$Location, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.8746
## Significance: 0.0001
##
## Permutation: free
## Number of permutations: 9999
# NMDS plot colored by Location; the title embeds the ANOSIM result above
nmds_bray_plot <- plot_ordination(physeq_filter_phyla, nmds_bray, label = "sample",
color = "Location", shape = "Source") + theme_bw() +
scale_fill_manual(values = location_colors) + scale_color_manual(values = location_colors)+
labs(col = "Location") + labs(title="NMDS, Bray-Curtis distance\nANOSIM Location: R=0.8746, p=0.0001, Permutations=9999") +
geom_point(size=3) + theme_bw()
# Annotate the stress value in the lower-right corner
nmds_bray_plot <- nmds_bray_plot +
annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_bray),
hjust = 3.1, vjust = -1.9, size = 3)
# BUG FIX: the display expression and the pdf() call were fused into one
# token ("nmds_bray_plotpdf("), which called an undefined function.
nmds_bray_plot
pdf("results/plots/03.diversity/06.NMDS_Shepard_Bray_Fit.pdf")
stressplot(nmds_bray) # base graphics: draws directly into the device
dev.off()
pdf("results/plots/03.diversity/07.NMDS_Bray_Location.pdf")
print(nmds_bray_plot) # print() required for ggplot objects inside pdf()
dev.off()
# Same ordination, colored by Source instead of Location
nmds_bray_plot_2 <- plot_ordination(physeq_filter_phyla, nmds_bray, label = "sample",
color = "Source", shape = "Location") + theme_bw() +
scale_color_brewer(palette = "Set2")+ scale_fill_brewer(palette = "Set2")+
labs(col = "Source") + labs(title="NMDS, Bray-Curtis distance\nANOSIM Location: R=0.8746, p=0.0001, Permutations=9999") +
geom_point(size=3) + theme_bw()
nmds_bray_plot_2 <- nmds_bray_plot_2 +
annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_bray),
hjust = 3.1, vjust = -1.9, size = 3)
pdf("results/plots/03.diversity/08.NMDS_Bray_Location_2.pdf")
print(nmds_bray_plot_2)
dev.off()
nmds_bray_plot_2
# Weighted UniFrac uses relative abundance and is less sensitive to sample size
nmds_wunifrac <- ordinate(physeq_filter_phyla, method = "NMDS", distance = "wunifrac")
# stress variable
var_stress_nmds_wu <- round(nmds_wunifrac$stress, 5)
var_stress_nmds_wu
## [1] 0.1118
# Shepard plot: checks that the ordination fit is good
stressplot(nmds_wunifrac)
# Weighted UniFrac NMDS
nmds_wu <- plot_ordination(physeq_filter_phyla, nmds_wunifrac, label = "Sample",
color = "Location", shape = "Location") + theme_bw() +
scale_color_brewer(palette = "Set2")+ scale_fill_brewer(palette = "Set2")+
labs(col = "Location") + labs(title="NMDS, Weighted UniFrac distance") +
geom_point(size=3) + theme_bw()
nmds_wu <- nmds_wu +
annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_wu),
hjust = 1.1, vjust = -1.1, size = 4)
pdf("results/plots/03.diversity/07.NMDS_Shepard_WUniFrac_Fit.pdf")
stressplot(nmds_wunifrac)
dev.off()
pdf("results/plots/03.diversity/08.NMDS_WUniFrac_Location.pdf")
# BUG FIX: print() required for ggplot objects inside pdf(); the bare object
# left the PDF empty when the script was source()d.
print(nmds_wu)
dev.off()
print(nmds_wu)
### 0.6 abundances
library(ampvis2)
# ampvis2 needs the read-count and taxonomy tables extracted from the
# phyloseq object, plus metadata with the sample IDs as a column.
otu_table_ampvis <- data.frame(OTU = rownames(phyloseq::otu_table(physeq_filter_phyla)@.Data),
phyloseq::otu_table(physeq_filter_phyla)@.Data,
phyloseq::tax_table(physeq_filter_phyla)@.Data,
check.names = FALSE
)
# Metadata
meta_data_ampvis <- data.frame(phyloseq::sample_data(physeq_filter_phyla),
check.names = FALSE
)
# Move the row names into a SampleID column.
# NOTE(review): rownames_to_column() is from tibble and %>% from
# magrittr/dplyr — confirm those packages are loaded.
meta_data_ampvis <- meta_data_ampvis %>% rownames_to_column(var = "SampleID")
# ampvis object
av2 <- amp_load(otu_table_ampvis, meta_data_ampvis)
# Top-25 taxa abundance boxplot grouped by Location (normalised, log scale)
ampv_boxplot_abundance_location <- amp_boxplot(av2,
group_by = "Location",
sort_by = mean,
plot_type = "boxplot",
tax_show = 25,
tax_add = "Phylum",
normalise = TRUE,
plot_log = TRUE) +
scale_color_manual(values = location_colors) +
scale_fill_manual(values = location_colors)
pdf("results/plots//ampv_boxplot_abundance_location.pdf")
# BUG FIX: the plot object was assigned between pdf() and dev.off() without
# being printed, producing an empty PDF.
print(ampv_boxplot_abundance_location)
dev.off()
ampv_boxplot_abundance_location
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 187 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Same boxplot grouped by Source (top 20 taxa)
ampv_boxplot_abundance_source <- amp_boxplot(av2,
group_by = "Source",
sort_by = mean,
plot_type = "boxplot",
tax_show = 20,
tax_add = "Phylum",
normalise = TRUE,
plot_log = TRUE) +
scale_color_brewer(palette = "Set2")+
scale_fill_brewer(palette = "Set2")
pdf("results/plots/ampv_boxplot_abundance_source.pdf")
print(ampv_boxplot_abundance_source)
dev.off()
ampv_boxplot_abundance_source
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 159 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#rank abundance curves
rank_ab <- amp_rankabundance(av2, log10_x = TRUE, group_by = "Source") +
scale_color_brewer(palette = "Set2") +
scale_fill_brewer(palette = "Set2")
pdf("results/plots/03.diversity/rank_abundance_curves.pdf")
print(rank_ab)
dev.off()
rank_ab
## Warning: Removed 1182 rows containing missing values or values outside the scale range
## (`geom_line()`).
#heatmap abundances
# NOTE(review): plot_legendbreaks = c(1, 5, 5) repeats 5 in all three
# heatmaps below — possibly c(1, 5, 50) or similar was intended; confirm.
ampv_heatmap_abundances_genus <- amp_heatmap(av2,
group_by = "Source",
facet_by = "Location",
plot_values = TRUE,
tax_show = 30,
tax_aggregate = "Genus",
tax_add = "Family",
plot_colorscale = "log10",
plot_legendbreaks = c(1, 5, 5),
color_vector = c("white","deepskyblue3","magenta3")
)
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_genus.pdf")
# BUG FIX: the plot was assigned between pdf() and dev.off() without being
# printed, producing an empty PDF (same fix applied to all heatmaps below).
print(ampv_heatmap_abundances_genus)
dev.off()
ampv_heatmap_abundances_genus
## Warning in scale_fill_gradientn(colours = color.pal, trans = plot_colorscale, :
## log-10 transformation introduced infinite values.
# Family-level heatmap
ampv_heatmap_abundances_family <- amp_heatmap(av2,
group_by = "Source",
facet_by = "Location",
plot_values = TRUE,
tax_show = 30,
tax_aggregate = "Family",
tax_add = "Phylum",
plot_colorscale = "log10",
plot_legendbreaks = c(1, 5, 5),
color_vector = c("white","deepskyblue3","magenta3")
)
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_family.pdf")
print(ampv_heatmap_abundances_family)
dev.off()
ampv_heatmap_abundances_family
# Order-level heatmap (top 35)
ampv_heatmap_abundances_order <- amp_heatmap(av2,
group_by = "Source",
facet_by = "Location",
plot_values = TRUE,
tax_show = 35,
tax_aggregate = "Order",
tax_add = "Phylum",
plot_colorscale = "log10",
plot_legendbreaks = c(1, 5, 5),
color_vector = c("white","deepskyblue3","magenta3")
)
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_order.pdf")
print(ampv_heatmap_abundances_order)
dev.off()
ampv_heatmap_abundances_order
## Warning in scale_fill_gradientn(colours = color.pal, trans = plot_colorscale, :
## log-10 transformation introduced infinite values.